#!/usr/bin/perl
# 
# Exercise 9.7
# 
# Extend the restriction map software to take into account the opposite strand for 
# nonpalindromic recognition sites.
#
# Answer to Exercise 9.7

# First: You may already have noticed a certain problem with our previous REBASE code.
# The "bionet" database file often includes more than one entry for a given restriction enzyme,
# to handle different cut sites, and especially for non-palindromic restriction sites.
# So, now is a good place to patch that problem in our code.  We'll change the parseREBASE
# subroutine so that it stores only the rebase-derived cut sites (deferring the translation
# to regular expressions of the cut sites) so that the values of the DBM hash file will now
# be space-separated rebase cut sites.
#
# Second: After doing that, we'll examine the documentation for a better understanding of when we
# need to search for the reverse complement of a given restriction site:
# we'll consult the documentation at http://www.neb.com/rebase/rebhelp.html
# We decide to not search for palindromic restriction sites (since they will appear in
# exactly the same place in the reverse complement), but only non-palindromic sites.
#
# Third: We add the code to search for non-palindromic recognition sites.  We do this
# by calculating the regular expression for the complement (not reverse complement!) of
# the recognition site, and then searching in the sequence.
#
# Fourth: Along the way, we found the following advice (plus more) in the documentation.
# The message here is that our code, from the book and from the exercises, still hasn't
# done a complete job of handling REBASE data.  To attempt to do so would make an interesting
# assignment for a class project.  Remember, there are several more files besides the
# "bionet" file.

## Excerpt from "rebhelp.html"
##                  Copyright (c) Dr. Richard J. Roberts, 2000.
##                             All rights reserved.

################################################################################
## 
## General Help:
## 
## ----------------------------------------------------------------------------
## RECOGNITION SEQUENCE NOMENCLATURE:
## 
## REBASE Recognition sequences representations use the standard abbreviations
## (Eur. J. Biochem. 150: 1-5, 1985) to represent ambiguity:
## 
##                         R = G or A
##                         Y = C or T
##                         M = A or C
##                         K = G or T
##                         S = G or C
##                         W = A or T
##                         B = not A (C or G or T)
##                         D = not C (A or G or T)
##                         H = not G (A or C or T)
##                         V = not T (A or C or G)
##                         N = A or C or G or T
## 
## These are written from 5' to 3', only one strand being given. If the point
## of cleavage has been determined, the precise site is marked with ^. For
## enzymes such as HgaI, MboII etc., which cleave away from their recognition
## sequence the cleavage sites are indicated in parentheses.
## 
## For example HgaI GACGC (5/10) indicates cleavage as follows:
## 
##                         5' GACGCNNNNN^      3'
##                         3' CTGCGNNNNNNNNNN^ 5'
## 
## Typically, the recognition sequences are oriented so that the cleavage sites
## lie on their 3' side.
## ----------------------------------------------------------------------------
## NOTE:
## 
## Homing endonucleases do not really have recognition sequences in the way
## that restriction enzymes do. The recognition sequence listed is one site
## that is known to be recognized and cleaved. In general, single base changes
## merely change the efficiency of cleavage and the precise boundary of
## required bases is not known.
## 
## For putative enzymes, the recognition sequences are predicted.
## ----------------------------------------------------------------------------
## ENZYMES WITH UNUSUAL CLEAVAGE PROPERTIES:
## 
## Enzymes that cut on both sides of their recognition sequences, such as BcgI,
## Bsp24I, CjeI and CjePI, have 4 cleavage sites each instead of 2.
## 
##       Bsp24I
##                 5'      ^NNNNNNNNGACNNNNNNTGGNNNNNNNNNNNN^   3'
##                 3' ^NNNNNNNNNNNNNCTGNNNNNNACCNNNNNNN^        5'
## 
## This will be described in some REBASE reports as:
## 
##                        Bsp24I (8/13)GACNNNNNNTGG(12/7)
## ----------------------------------------------------------------------------
## METHYLATION SITES:
## 
## The site of methylation by the cognate methylase when known is indicated
## X(Y) or X,X2(Y,Y2), where X is the base within the recognition sequence that
## is modified. A negative number indicates the complementary strand, numbered
## from the 5' base of that strand, and Y is the specific type of methylation
## involved:
## 
##                         (6) = N6-methyladenosine
##                         (5) = 5-methylcytosine
##                         (4) = N4-methylcytosine
## 
## If the methylation information is different for the 3' strand, X2 and Y2 are
## given as well.
## ----------------------------------------------------------------------------
## TERMS:
##    Prototype............The first enzyme to have been discovered with each
##                              known specificity.
##  Isoschizomers......Enzymes that recognize the same pattern, and cut at the
##                                 same bases.
##    Neoschizomers......Enzymes that recognize the same pattern, but cut at
##                               different bases.
## ----------------------------------------------------------------------------
## ----------------------------------------------------------------------------


# Now, on with our solution to exercise 9.7

# Modify Exercise 9.3

use strict;
use warnings;
use BeginPerlBioinfo;     # see Chapter 6 about this module

# Declare and initialize variables
my %rebase_hash = ();

unless(dbmopen(%rebase_hash, 'DBNAME', 0644)) {

	print "Cannot open DBM file DBNAME with mode 0644\n";
}

my @file_data = ();
my $query = '';
my $dna = '';
my @recognition_sites = ();
my $recognition_site = '';
my $regexp = '';
my @locations = ();


# If there is a command-line argument, assume it's DNA
if(@ARGV) {
    $dna = $ARGV[0];

# Otherwise, prompt for a FASTA file to open
}else{

    print "Input a FASTA filename: ";
    my $filename = <STDIN>;
    chomp $filename;

    unless(-e $filename) {
    	print "$filename does not exist!\n";
	exit;
    }
    # Read in the file 
    @file_data = get_file_data($filename);

    # Extract the DNA sequence data from the contents of the FASTA file
    $dna = extract_sequence_from_fasta_data(@file_data);
}

# Get the REBASE data into a hash, from file "bionet.110"
# !!!!!!!!Note the new version of parseREBASE!!!!!!!!!!
%rebase_hash = parseREBASE2('bionet.110');


# Prompt user for restriction enzyme names, create restriction map
do {
    print "Search for what restriction site (or quit)?: ";
    
    $query = <STDIN>;

    chomp $query;

    # Exit if empty query
    if ($query =~ /^\s*$/ ) {

        exit;
    }

    # Perform the search in the DNA sequence,
    # translating the recognition sites to regular expressions

    if ( exists $rebase_hash{$query} ) {

        @recognition_sites = split ( " ", $rebase_hash{$query});

	foreach $recognition_site (@recognition_sites) {

            $regexp = IUB_to_regexp($recognition_site);

            # Create the restriction map
            # Store these positions with a leading + sign, like "+324"
            push @locations, map($_ = "+$_", match_positions($regexp, $dna));

	    # For non-palindromic recognition sites,
	    # Search for the complement in the sequence
	    if($recognition_site ne revcomIUB($recognition_site)) {

		# Calculate the regular expression for the complement
		(my $complement = $recognition_site)
                  =~ tr/ACGTRYMKSWBDHVNacgtrymkswbdhvn/TGCAYRKMWSVHDBNtgcayrkmwsvhdbn/;
	        my $regularexpressioncom = IUB_to_regexp($complement);
    
		# Get the matching positions for the complement
		# Store these in @positions with a leading - sign, like "-324"
            	push @locations, map($_ = "-$_", match_positions($regularexpressioncom, $dna));
	    }

	    # Sort the locations, ignoring the leading + and - signs
	    @locations =
	      sort {my($A,$B); ($A=$a) =~ s/^.//; ($B=$b) =~ s/^.//; $A <=> $B; } @locations;
    
            # Report the restriction map to the user
            if (@locations) {
                print "Searching for $query $recognition_site $regexp\n";
                print "A restriction site for $query at locations:\n";
                print join(" ", @locations), "\n";
            } else {
                print "A restriction enzyme $query is not in the DNA:\n";
            }
	}
    }
    print "\n";
} until ( $query =~ /quit/ );

exit;

################################################################################
# Subroutines
################################################################################


################################################################################
# parseREBASE2 - Parse REBASE bionet file, version 2
#
# A subroutine to return a hash where
#    key   = restriction enzyme name
#    value = whitespace-separated recognition sites
#    (the regular expressions will be calculated on the fly elsewhere)
#
# Version 2 handles multiple definition lines for an enzyme name
# Version 2 also handles alternate enzyme names on a line

sub parseREBASE2 {

    my($rebasefile) = @_;

    use strict;
    use warnings;
    use BeginPerlBioinfo;     # see Chapter 6 about this module

    # Declare variables
    my @rebasefile = (  );
    my %rebase_hash = (  );
    my $site;
    my $regexp;

    # Read in the REBASE file
    my $rebase_filehandle = open_file($rebasefile);

    while(<$rebase_filehandle>) {

        my @names = ();

        # Discard header lines
        ( 1 .. /Rich Roberts/ ) and next;

        # Discard blank lines
        /^\s*$/ and next;
    
        # Split the two (or three if includes parenthesized name) fields
        my @fields = split( " ", $_);

        # Get and store the recognition site
        $site = pop @fields;
	# For the purposes of this exercise, we'll ignore cut sites (^).
	# This is not something you'd want to do in general, however!
	$site =~ s/\^//g;

        # Get and store the name and the recognition site.
        # Add alternate (parenthesized) names
        # from the middle field, if any
	foreach my $name (@fields) {

	    if($name =~ /\(.*\)/) {
	        $name =~ s/\((.*)\)/$1/;
	    }
	    push @names, $name;
	}

        # Store the data into the hash, avoiding duplicates (ignoring ^ cut sites)
	# and ignoring reverse complements
	foreach my $name (@names) {

	    # Add new enzyme definition
	    if(not defined $rebase_hash{$name}) {
                $rebase_hash{$name} = "$site";
		next;
	    }

            my(@defined_sites) = split(" ", $rebase_hash{$name});

	    # Omit already defined sites
	    if(grep {$site eq $_} @defined_sites) {
		next;
	    # Omit reverse complements of already defined sites
	    }elsif(grep {revcomIUB($site) eq $_} @defined_sites) {
	        next;
	    # Add the additional site
 	    }else{
	        $rebase_hash{$name} .= " $site";
	    }
	}
    }

    # Return the hash containing the reformatted REBASE data
    return %rebase_hash;
}

sub revcomIUB {
    my($seq) = @_;

    my $revcom = reverse complementIUB($seq);

    return $revcom;
}

sub complementIUB {
    my($seq) = @_;

    (my $com = $seq) =~ tr [ACGTRYMKSWBDHVNacgtrymkswbdhvn]
                              [TGCAYRKMWSVHDBNtgcayrkmwsvhdbn];

    return $com;
}
